import numpy as np
import pandas as pd
import seaborn as sns
import random
from matplotlib import pyplot as plt
# Load the model-ready New York Airbnb listings and preview the first rows.
data = pd.read_csv("../data/to_model/new_york_listings_2024_to_model.csv")
data.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | ... | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | rating | bedrooms | beds | baths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 718037328155265207 | Rental unit in Queens · 5 bedrooms · 8 beds · ... | 204539 | Mark | Queens | Ridgewood | 40.70262 | -73.89800 | Entire home/apt | 471.0 | ... | 2022-11-03 | 0.07 | 9 | 365 | 0 | No License | No rating | 5 | 8 | 2 |
| 1 | 5536638 | Rental unit in Brooklyn · ★4.60 · 2 bedrooms ·... | 28709982 | Sidiq | Brooklyn | Williamsburg | 40.72027 | -73.95513 | Entire home/apt | 225.0 | ... | 2023-09-13 | 0.49 | 5 | 90 | 7 | No License | 4.60 | 2 | 2 | 1 |
| 2 | 4605840 | Rental unit in Brooklyn · ★4.93 · 1 bedroom · ... | 23788242 | Suzanne | Brooklyn | Bedford-Stuyvesant | 40.68670 | -73.94856 | Private room | 80.0 | ... | 2023-11-17 | 1.00 | 2 | 363 | 12 | No License | 4.93 | 1 | 1 | 1 |
| 3 | 594240338554500815 | Rental unit in New York · 1 bedroom · 1 bed · ... | 384559808 | Best Inns USA | Manhattan | East Village | 40.73221 | -73.98689 | Entire home/apt | 83.0 | ... | 2023-08-11 | 0.12 | 30 | 106 | 1 | No License | No rating | 1 | 1 | 1 |
| 4 | 26785 | Rental unit in Brooklyn · ★4.93 · 1 bedroom · ... | 42273 | Dani | Brooklyn | South Slope | 40.66860 | -73.98723 | Entire home/apt | 90.0 | ... | 2022-05-31 | 2.90 | 3 | 188 | 0 | No License | 4.93 | 1 | 1 | 1 |
5 rows × 22 columns
Z opisu ramki danych na Kaggle wynika, że autor zrobił już część pracy za nas (np. wyekstrahował rating z kolumny name).
Dane oryginalnie pochodzą ze strony: źródło, a znaczenie kolumn można znaleźć w pliku: oznaczenia (ze strony Airbnb).
# Number of rows and columns in the loaded frame.
data.shape
(16606, 22)
Mamy 16606 wierszy i 22 kolumny.
Kolumny, które wymagają wytłumaczenia:
# Count the missing values in every column.
data.isnull().sum()
id 0 name 0 host_id 0 host_name 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 0 reviews_per_month 0 calculated_host_listings_count 0 availability_365 0 number_of_reviews_ltm 0 license 0 rating 0 bedrooms 0 beds 0 baths 0 dtype: int64
Brak nulli :)
# Column dtypes and non-null counts of the raw frame.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16606 entries, 0 to 16605 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 16606 non-null int64 1 name 16606 non-null object 2 host_id 16606 non-null int64 3 host_name 16606 non-null object 4 neighbourhood_group 16606 non-null object 5 neighbourhood 16606 non-null object 6 latitude 16606 non-null float64 7 longitude 16606 non-null float64 8 room_type 16606 non-null object 9 price 16606 non-null float64 10 minimum_nights 16606 non-null int64 11 number_of_reviews 16606 non-null int64 12 last_review 16606 non-null object 13 reviews_per_month 16606 non-null float64 14 calculated_host_listings_count 16606 non-null int64 15 availability_365 16606 non-null int64 16 number_of_reviews_ltm 16606 non-null int64 17 license 16606 non-null object 18 rating 16606 non-null object 19 bedrooms 16606 non-null object 20 beds 16606 non-null int64 21 baths 16606 non-null object dtypes: float64(4), int64(8), object(10) memory usage: 2.8+ MB
Widać, że niektóre kolumny mają nieoczekiwane typy, np. bedrooms i baths mają typ object zamiast int (czasami zawierają stringi). Trzeba będzie to naprawić, dlatego zobaczmy, jak to wygląda „od środka”.
import re
def contains_only_numbers_and_dots(s):
    """Return True when ``s``, ignoring surrounding whitespace, consists
    solely of digits and dots.

    The check is purely character-based: an empty string gives False, while
    a string with several dots such as "1.2.3" still gives True.
    """
    stripped = s.strip()
    return bool(re.match(r"^[0-9.]+$", stripped))
# Try the helper on the first rating value.
print(data["rating"][0])
contains_only_numbers_and_dots(data["rating"][0])
No rating
False
Sprawdźmy, jakie wartości inne niż liczbowe występują w rating, bedrooms i baths. Potem trzeba będzie się zastanowić, jak zamienić te stringi na liczby lub co w ogóle z nimi zrobić.
# Show which non-numeric labels occur in the columns that should be numeric.
to_check = ["rating", "bedrooms", "baths"]
for column in to_check:
    is_numeric = data[column].apply(contains_only_numbers_and_dots)
    print(data.loc[~is_numeric, column].value_counts())
    print("------------------------------")
No rating 2865 New 135 Name: rating, dtype: int64 ------------------------------ Studio 1438 Name: bedrooms, dtype: int64 ------------------------------ Not specified 12 Name: baths, dtype: int64 ------------------------------
W sumie zobaczmy te wiersze, które mają baths == "Not specified".
# Listing names of the rows whose bath count is "Not specified".
# Author's note: we can now safely assume that shared baths count as baths.
data.loc[data["baths"] == "Not specified", "name"]
67 Rental unit in Brooklyn · ★5.0 · 1 bedroom 166 Rental unit in New York · ★4.59 · Studio · 2 beds 4697 Rental unit in New York · ★4.17 · 1 bedroom · ... 6008 Townhouse in Brooklyn · ★4.87 · Studio 8626 Home in Queens · ★4.74 · 1 bedroom 9829 Rental unit in Brooklyn · ★4.52 · 1 bedroom · ... 12497 Boutique hotel in New York · Studio · 1 bed 12921 Home in Brooklyn · ★4.96 · 1 bedroom · 1 bed 13012 Rental unit in New York · ★4.86 · 1 bedroom · ... 14569 Hotel in New York · Studio · 1 bed 15425 Rental unit in Brooklyn · 1 bedroom 15807 Rental unit in Brooklyn · 1 bedroom · 1 bed Name: name, dtype: object
# What the `name` column contains (re-run the cell for a different sample).
# random.randint includes BOTH endpoints, so an upper bound of data.shape[0]
# can yield a position one past the last row and raise KeyError.
# randrange excludes the stop value, and .iloc makes the lookup positional.
for _ in range(20):
    print(data["name"].iloc[random.randrange(data.shape[0])])
Home in Queens · ★4.0 · 1 bedroom · 1 bed · 1 shared bath Townhouse in Queens · ★4.90 · 1 bedroom · 1 bed · 1 private bath Rental unit in Brooklyn · ★4.77 · 2 bedrooms · 2 beds · 1 bath Serviced apartment in Brooklyn · ★4.50 · 1 bedroom · 1 bed · 1 bath Rental unit in Brooklyn · 1 bedroom · 1 bed · 1 shared bath Rental unit in New York · ★4.50 · 1 bedroom · 1 bed · 2 shared baths Rental unit in Brooklyn · ★4.85 · 1 bedroom · 1 bed · 1 shared bath Rental unit in New York · ★4.75 · 1 bedroom · 1 bed · 1 bath Rental unit in New York · ★4.86 · 1 bedroom · 1 bed · 1 bath Rental unit in New York · ★4.61 · Studio · 1 bed · 1 bath Rental unit in Brooklyn · ★4.58 · 2 bedrooms · 2 beds · 1 bath Rental unit in New York · ★4.43 · 1 bedroom · 1 bed · 3.5 shared baths Rental unit in Queens · ★4.76 · 2 bedrooms · 4 beds · 1 bath Boutique hotel in New York · ★4.45 · 1 bedroom · 1 bed · 1 shared bath Rental unit in Brooklyn · ★4.70 · 1 bedroom · 2 beds · 1 bath Rental unit in Brooklyn · ★4.67 · 1 bedroom · 1 bed · 1 shared bath Home in Queens · ★3.80 · 1 bedroom · 1 bed · 2 shared baths Rental unit in Brooklyn · 1 bedroom · 1 bed · 1 bath Rental unit in New York · 1 bedroom · 2 beds · 1 bath Rental unit in New York · ★4.38 · 1 bedroom · 1 bed · 1 bath
Zostawilibysmy jednak kolumny name i host_name/id jako, że:
# Take a closer look at host_id.
data["host_id"].value_counts() # there turn out to be few large hosts (companies) renting on Airbnb, so the column can be dropped later
51501835 107
162280872 103
61391963 96
107434423 81
19303369 72
...
151692758 1
110346058 1
1388987 1
274605727 1
478704134 1
Name: host_id, Length: 10458, dtype: int64
Ostatecznie porzucimy name, host_name i host_id (a także id).
# Drop the identifier-like columns in a single call and show the result.
data = data.drop(columns=["id", "name", "host_id", "host_name"])
data
| neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | rating | bedrooms | beds | baths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Queens | Ridgewood | 40.702620 | -73.898000 | Entire home/apt | 471.0 | 30 | 1 | 2022-11-03 | 0.07 | 9 | 365 | 0 | No License | No rating | 5 | 8 | 2 |
| 1 | Brooklyn | Williamsburg | 40.720270 | -73.955130 | Entire home/apt | 225.0 | 30 | 52 | 2023-09-13 | 0.49 | 5 | 90 | 7 | No License | 4.60 | 2 | 2 | 1 |
| 2 | Brooklyn | Bedford-Stuyvesant | 40.686700 | -73.948560 | Private room | 80.0 | 30 | 101 | 2023-11-17 | 1.00 | 2 | 363 | 12 | No License | 4.93 | 1 | 1 | 1 |
| 3 | Manhattan | East Village | 40.732210 | -73.986890 | Entire home/apt | 83.0 | 30 | 2 | 2023-08-11 | 0.12 | 30 | 106 | 1 | No License | No rating | 1 | 1 | 1 |
| 4 | Brooklyn | South Slope | 40.668600 | -73.987230 | Entire home/apt | 90.0 | 30 | 479 | 2022-05-31 | 2.90 | 3 | 188 | 0 | No License | 4.93 | 1 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16601 | Brooklyn | Bedford-Stuyvesant | 40.686001 | -73.943693 | Entire home/apt | 135.0 | 30 | 15 | 2023-01-10 | 0.97 | 1 | 262 | 1 | No License | 4.87 | 3 | 3 | 1 |
| 16602 | Queens | Corona | 40.745100 | -73.864880 | Private room | 68.0 | 30 | 16 | 2023-10-16 | 2.68 | 2 | 180 | 16 | No License | 4.69 | 1 | 1 | 1 |
| 16603 | Bronx | Allerton | 40.867770 | -73.860180 | Entire home/apt | 100.0 | 30 | 30 | 2023-10-07 | 2.21 | 5 | 365 | 24 | No License | 4.73 | 1 | 1 | 1 |
| 16604 | Manhattan | Harlem | 40.830791 | -73.949509 | Private room | 50.0 | 30 | 19 | 2023-12-10 | 0.21 | 4 | 42 | 1 | No License | 4.84 | 1 | 1 | 1.5 |
| 16605 | Manhattan | Midtown | 40.762250 | -73.978280 | Entire home/apt | 130.0 | 30 | 2 | 2023-06-01 | 0.20 | 5 | 365 | 2 | No License | No rating | 1 | 1 | 1 |
16606 rows × 18 columns
# Summary statistics of the numeric columns.
data.describe()
| latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | beds | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.000000 | 16606.00000 |
| mean | 40.726740 | -73.939112 | 190.328074 | 28.619716 | 42.664158 | 1.256612 | 18.554077 | 205.954113 | 10.804649 | 1.72799 |
| std | 0.060366 | 0.061366 | 1134.545935 | 35.219921 | 74.116856 | 1.903427 | 69.620276 | 135.066607 | 21.126468 | 1.22619 |
| min | 40.500314 | -74.249840 | 10.000000 | 1.000000 | 1.000000 | 0.010000 | 1.000000 | 0.000000 | 0.000000 | 1.00000 |
| 25% | 40.684119 | -73.980840 | 80.000000 | 30.000000 | 4.000000 | 0.210000 | 1.000000 | 87.000000 | 1.000000 | 1.00000 |
| 50% | 40.722935 | -73.949650 | 125.000000 | 30.000000 | 14.000000 | 0.650000 | 2.000000 | 214.000000 | 3.000000 | 1.00000 |
| 75% | 40.763025 | -73.917362 | 199.000000 | 30.000000 | 49.000000 | 1.800000 | 5.000000 | 354.000000 | 15.000000 | 2.00000 |
| max | 40.911147 | -73.713650 | 100000.000000 | 1250.000000 | 1865.000000 | 75.490000 | 713.000000 | 365.000000 | 1075.000000 | 42.00000 |
Z ciekawych rzeczy można spojrzeć na:
data["license"].value_counts() # everything other than "No License" and "Exempt" should be binned into a single group
No License 14057
Exempt 1702
OSE-STRREG-0000068 10
OSE-STRREG-0008664 6
OSE-STRREG-0000003 4
...
OSE-STRREG-0000152 1
OSE-STRREG-0000424 1
OSE-STRREG-0000956 1
OSE-STRREG-0000387 1
OSE-STRREG-0000341 1
Name: license, Length: 725, dtype: int64
def lic_mapper(a):
    """Encode a license label as a binary flag.

    Returns 0 for "No License" and "Exempt", and 1 for any other value.
    """
    return 0 if a in ("No License", "Exempt") else 1
data["license"] = data["license"].map(lic_mapper) # 1 if the listing has a license, 0 otherwise
data["license"].value_counts()
0 15759 1 847 Name: license, dtype: int64
na wykresach odcinamy 5% największych wartości, bo inaczej wykresy są tragiczne
numerical = ["price", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365", "number_of_reviews_ltm", "beds"]
hidden_numerical = ["rating", "bedrooms", "baths"]

# Histograms with the top 5% of every column cut off; without the cut the
# plots are unreadable.
fig, axs = plt.subplots(6, 2, figsize=(12, 36))
axs = axs.flatten()
bins = [30, 20, 20, 20, 20, 20, 20, 20, 30, 20, 20]

# Columns that are already numeric.
for i, col in enumerate(numerical):
    cutoff = np.quantile(data[col], 0.95)
    series = data.loc[data[col] <= cutoff, col]
    hist_ax = sns.histplot(x=series, ax=axs[i], color="brown", kde=True, bins=bins[i])
    hist_ax.set(title=col + " histogram")

# Columns stored as strings: keep only the parseable values and cast them.
for i, col in enumerate(hidden_numerical):
    parseable = data[col].apply(contains_only_numbers_and_dots)
    series = data.loc[parseable, col].apply(float)
    series = series.loc[series <= np.quantile(series, 0.95)]
    hist_ax = sns.histplot(x=series, ax=axs[8 + i], color="brown", kde=True, bins=bins[i + 8])
    hist_ax.set(title=col + " histogram")

plt.tight_layout()
plt.show()
Co oznacza 1.5 łazienki? (Najprawdopodobniej potem będę to rzutował do jednej.)
# Box plots of every numeric feature, this time without trimming.
fig, axs = plt.subplots(6, 2, figsize=(12, 36))
axs = axs.flatten()

# Columns that are already numeric.
for i, col in enumerate(numerical):
    box_ax = sns.boxplot(data=data, x=col, ax=axs[i], color="brown")
    box_ax.set(title=col + " boxplot")

# Columns stored as strings: keep only the parseable values and cast them.
for i, col in enumerate(hidden_numerical):
    parseable = data[col].apply(contains_only_numbers_and_dots)
    series = data.loc[parseable, col].apply(float)
    box_ax = sns.boxplot(x=series, ax=axs[8 + i], color="brown")
    box_ax.set(title=col + " boxplot")

plt.tight_layout()
plt.show()
Widzimy, że będzie problem z outlierami, ale tym zajmiemy się przy kolejnych etapach
# Columns that are not in the `numerical` list.
data.columns.difference(numerical)
Index(['baths', 'bedrooms', 'last_review', 'latitude', 'license', 'longitude',
'neighbourhood', 'neighbourhood_group', 'rating', 'room_type'],
dtype='object')
# Categorical columns drawn as count plots further down.
nonumerical = ['license', 'neighbourhood', 'neighbourhood_group', 'room_type']
# Helper that writes percentage labels onto the bars of a plot.
def without_hue(ax, feature):
    """Annotate every bar of ``ax`` with its height as a percentage of
    ``len(feature)``.

    Each label is anchored at the top of its bar, shifted 0.2 to the left of
    the bar's horizontal centre, and drawn with font size 12.
    """
    total = len(feature)
    for bar in ax.patches:
        height = bar.get_height()
        label = '{:.1f}%'.format(100 * height/total)
        anchor = (bar.get_x() + bar.get_width() / 2 - 0.2, bar.get_y() + height)
        ax.annotate(label, anchor, size=12)
# Flag the listings whose neighbourhood occurs more than 30 times in the frame.
counts = data["neighbourhood"].value_counts()
data["neighbourhood"].isin(counts[counts > 30].index)
0 True
1 True
2 True
3 True
4 True
...
16601 True
16602 True
16603 False
16604 True
16605 True
Name: neighbourhood, Length: 16606, dtype: bool
# Count plots of the categorical columns, bars ordered by frequency.
fig, axs = plt.subplots(2, 2, figsize=(14, 14))
axs = axs.flatten()
for i, col in enumerate(nonumerical):
    if col != "neighbourhood":
        ordered = data[col].value_counts().index
        sns.countplot(data=data, x=col, ax=axs[i], color="brown", order=ordered).set(title="Countplot of " + col)
    else:
        # For neighbourhood, keep only the values that occur more than 200
        # times in the frame.
        series = data[col]
        counts = series.value_counts()
        series = series[series.isin(counts[counts > 200].index)]
        bars = sns.countplot(x=series, ax=axs[i], color="brown", order=series.value_counts().index)
        bars.set_xticklabels(bars.get_xticklabels(), rotation=45, horizontalalignment='right')
        axs[i].set(title="Countplot of " + col)
    # NOTE(review): indentation was lost in this export; the percentage
    # labels are assumed to be added for every column - confirm.
    without_hue(axs[i], data[col])
plt.tight_layout()
plt.show()
# The cleaning steps are gathered here so that later project stages can
# apply them directly.
# Remember that `license` has already been mapped as well.
def bath_mapper(a):
    """Convert a bath label to a whole number of baths.

    "Not specified" becomes 0; any other value is parsed as a float and
    truncated towards zero, so "1.5" becomes 1.
    """
    return 0 if a == "Not specified" else int(float(a))
def bedroom_mapper(a):
    """Convert a bedroom label to an integer count.

    A "Studio" (the whole flat in one larger room) is treated as having one
    bedroom; any other value is passed to ``int``.
    """
    return 1 if a == "Studio" else int(a)
def rating_mapper(a):
    """Convert a rating label to a float, using NaN for unrated listings.

    "New" and "No rating" (with or without surrounding whitespace) map to
    ``np.nan``; every other value is passed to ``float``.
    """
    # Comparing against the exact string "New " (trailing space) lets a
    # stripped "New" fall through to float() and raise ValueError, so the
    # label is normalised first. Non-string inputs are left untouched.
    label = a.strip() if isinstance(a, str) else a
    if label in ("New", "No rating"):
        return np.nan
    return float(a)
# Apply the cleaning mappers to the string-typed columns.
data["baths"] = data["baths"].map(bath_mapper)
data["bedrooms"] = data["bedrooms"].map(bedroom_mapper)
data["rating"] = data["rating"].map(rating_mapper)
data["last_review"] = data["last_review"].map(np.datetime64) # the date of the last review is also mapped to a dedicated date type
#data = data.drop(["name", "host_id", "host_name"], axis = 1) -> already done above, where id was removed as well
data
| neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | rating | bedrooms | beds | baths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Queens | Ridgewood | 40.702620 | -73.898000 | Entire home/apt | 471.0 | 30 | 1 | 2022-11-03 | 0.07 | 9 | 365 | 0 | 0 | NaN | 5 | 8 | 2 |
| 1 | Brooklyn | Williamsburg | 40.720270 | -73.955130 | Entire home/apt | 225.0 | 30 | 52 | 2023-09-13 | 0.49 | 5 | 90 | 7 | 0 | 4.60 | 2 | 2 | 1 |
| 2 | Brooklyn | Bedford-Stuyvesant | 40.686700 | -73.948560 | Private room | 80.0 | 30 | 101 | 2023-11-17 | 1.00 | 2 | 363 | 12 | 0 | 4.93 | 1 | 1 | 1 |
| 3 | Manhattan | East Village | 40.732210 | -73.986890 | Entire home/apt | 83.0 | 30 | 2 | 2023-08-11 | 0.12 | 30 | 106 | 1 | 0 | NaN | 1 | 1 | 1 |
| 4 | Brooklyn | South Slope | 40.668600 | -73.987230 | Entire home/apt | 90.0 | 30 | 479 | 2022-05-31 | 2.90 | 3 | 188 | 0 | 0 | 4.93 | 1 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16601 | Brooklyn | Bedford-Stuyvesant | 40.686001 | -73.943693 | Entire home/apt | 135.0 | 30 | 15 | 2023-01-10 | 0.97 | 1 | 262 | 1 | 0 | 4.87 | 3 | 3 | 1 |
| 16602 | Queens | Corona | 40.745100 | -73.864880 | Private room | 68.0 | 30 | 16 | 2023-10-16 | 2.68 | 2 | 180 | 16 | 0 | 4.69 | 1 | 1 | 1 |
| 16603 | Bronx | Allerton | 40.867770 | -73.860180 | Entire home/apt | 100.0 | 30 | 30 | 2023-10-07 | 2.21 | 5 | 365 | 24 | 0 | 4.73 | 1 | 1 | 1 |
| 16604 | Manhattan | Harlem | 40.830791 | -73.949509 | Private room | 50.0 | 30 | 19 | 2023-12-10 | 0.21 | 4 | 42 | 1 | 0 | 4.84 | 1 | 1 | 1 |
| 16605 | Manhattan | Midtown | 40.762250 | -73.978280 | Entire home/apt | 130.0 | 30 | 2 | 2023-06-01 | 0.20 | 5 | 365 | 2 | 0 | NaN | 1 | 1 | 1 |
16606 rows × 18 columns
# Re-check the column dtypes after the cleaning step.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16606 entries, 0 to 16605 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 neighbourhood_group 16606 non-null object 1 neighbourhood 16606 non-null object 2 latitude 16606 non-null float64 3 longitude 16606 non-null float64 4 room_type 16606 non-null object 5 price 16606 non-null float64 6 minimum_nights 16606 non-null int64 7 number_of_reviews 16606 non-null int64 8 last_review 16606 non-null datetime64[ns] 9 reviews_per_month 16606 non-null float64 10 calculated_host_listings_count 16606 non-null int64 11 availability_365 16606 non-null int64 12 number_of_reviews_ltm 16606 non-null int64 13 license 16606 non-null int64 14 rating 13606 non-null float64 15 bedrooms 16606 non-null int64 16 beds 16606 non-null int64 17 baths 16606 non-null int64 dtypes: datetime64[ns](1), float64(5), int64(9), object(3) memory usage: 2.3+ MB
teraz wszystko jest w dobrych typach :)
# Correlation heatmap over the whole frame (nothing interesting shows up).
# numeric_only=True restricts the matrix to numeric columns explicitly.
# Relying on the default emits a FutureWarning in pandas 1.5 and fails on the
# remaining text columns once that default becomes False.
plt.figure(figsize=(10, 10))
sns.heatmap(data.corr(numeric_only=True), cmap='coolwarm', center=0, vmax=1, vmin=-1).set(title="Correlation heatmap (whole df)")
plt.show()
C:\Users\barto\AppData\Local\Temp\ipykernel_4412\2243212398.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(data.corr(), cmap = 'coolwarm', center = 0, vmax = 1, vmin = -1).set(title = "Correlation heatmap (whole df)")
Nic interesującego nie możemy tu zauważyć.
# With the "abnormal" prices (top 5%) removed the picture is slightly
# clearer, although there is still not much to see.
# numeric_only=True restricts the matrix to numeric columns explicitly.
# Relying on the default emits a FutureWarning in pandas 1.5 and fails on the
# remaining text columns once that default becomes False.
plt.figure(figsize=(10, 10))
trimmed = data[data["price"] <= np.quantile(data["price"], 0.95)]
sns.heatmap(trimmed.corr(numeric_only=True), cmap='coolwarm', center=0, vmax=1, vmin=-1).set(title="Correlation heatmap (without upper 5% prices)")
plt.show()
C:\Users\barto\AppData\Local\Temp\ipykernel_4412\2299295370.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(data[data["price"] <= np.quantile(data["price"], 0.95)].corr(), cmap = 'coolwarm', center = 0, vmax = 1, vmin = -1).set(title = "Correlation heatmap (without upper 5% prices)")
Wyobrażając sobie sytuację, że robimy projekt z budowaniem modelu zajmującego się regresją cen... Poza tym te pomysły na wizualizację od razu się nasuwają.
# Feature columns plotted against price from here on.
numerical = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license', 'rating','bedrooms', 'beds', 'baths']
# 0.95 quantile of price.
np.quantile(data["price"], 0.95)
430.0
# For plotting, drop the observations in the top 5% of price; this frame is
# the one mostly used from here on.
price_df = data.loc[data["price"] <= np.quantile(data["price"], 0.95)]
# There are too many points for scatter plots to make sense; some
# heatmap-based representation is needed instead.
fig, axs = plt.subplots(6, 2, figsize=(10, 30))
axs = axs.flatten()
for panel, col in zip(axs, numerical):
    sns.scatterplot(data=price_df, x=col, y="price", hue="neighbourhood_group", ax=panel)
plt.tight_layout()
plt.show()
# Plot borrowed from a previous project: the joint effect of two numeric
# variables on price.
# The darker a cell, the higher the mean price of the observations in it;
# every cell is annotated with its observation count.
# Further right / further up means a higher value of the respective variable.
# It is worth downloading the figure and inspecting it zoomed in.
bins = 10
fig, axs = plt.subplots(10, 10, figsize=(50, 50))
fig.subplots_adjust(hspace=0.4)
for i, col1 in enumerate(numerical):
    # Only the pairs with j < i are drawn.
    for j, col2 in enumerate(numerical[:i]):
        binned = pd.DataFrame({
            col1: pd.qcut(price_df[col1], q=bins, duplicates="drop"),
            col2: pd.qcut(price_df[col2], q=bins, duplicates="drop"),
            "price": price_df["price"].copy(),
        })
        grouped = binned.groupby([col1, col2])
        annot = (
            grouped.count()
            .reset_index()
            .pivot(index=col1, columns=col2, values="price")
            .sort_index(ascending=False)
        )
        df = (
            grouped.mean()
            .reset_index()
            .pivot(index=col1, columns=col2, values="price")
            .sort_index(ascending=False)
        )
        cell_ax = sns.heatmap(df, cmap=sns.cm.rocket_r, ax=axs[i - 1][j], annot=annot, fmt='g')
        cell_ax.tick_params(labelleft=False, left=False, labelbottom=False, bottom=False)
plt.tight_layout()
plt.show()
# Price by number of beds, split by neighbourhood group.
_, beds_ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x="beds", y="price", hue="neighbourhood_group", data=price_df, ax=beds_ax)
plt.show()
# Mean price for every neighbourhood group / room type combination.
df = price_df.groupby(["neighbourhood_group", "room_type"], as_index=False)["price"].mean()
plt.figure(figsize=(10, 10))
ax = sns.barplot(x="neighbourhood_group", y="price", hue="room_type", data=df)
ax.set(ylabel="avg price")
plt.show()
# Mean price for every neighbourhood group / license flag combination.
df = price_df.groupby(["neighbourhood_group", "license"], as_index=False)["price"].mean()
plt.figure(figsize=(10, 10))
ax = sns.barplot(x="neighbourhood_group", y="price", hue="license", data=df)
ax.set(ylabel="avg price")
plt.show()
# Price distribution within each reviews_per_month decile.
review_deciles = pd.qcut(price_df["reviews_per_month"], q=10)
plt.figure(figsize=(10, 10))
ax = sns.boxplot(x=review_deciles, y="price", data=price_df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()
# Price distribution within three equal-width minimum_nights intervals.
night_bins = pd.cut(price_df["minimum_nights"], bins=3)
plt.figure(figsize=(10, 10))
ax = sns.boxplot(x=night_bins, y="price", data=price_df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()
# Rating by neighbourhood group; the y axis is limited to 3.9-5.05.
_, rating_ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x="neighbourhood_group", y="rating", data=price_df, ax=rating_ax)
plt.ylim([3.9, 5.05])
plt.show()
# Rating distribution within each price decile.
price_deciles = pd.qcut(price_df["price"], q=10)
plt.figure(figsize=(10, 10))
ax = sns.boxplot(x=price_deciles, y="rating", data=price_df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.ylim([3.9, 5.05])
plt.show()
# Rating distribution within each price decile, split by neighbourhood group.
price_deciles = pd.qcut(price_df["price"], q=10)
plt.figure(figsize=(10, 10))
ax = sns.boxplot(x=price_deciles, y="rating", hue="neighbourhood_group", data=price_df)
sns.move_legend(ax, 'lower right')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.ylim([3.9, 5.05])
plt.show()
# Rating distribution within each reviews_per_month decile.
review_deciles = pd.qcut(price_df["reviews_per_month"], q=10)
plt.figure(figsize=(10, 10))
ax = sns.boxplot(x=review_deciles, y="rating", data=price_df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.ylim([3.9, 5.05])
plt.show()
import plotly.express as px
Gdzie są konkretne dzielnice w NY?
# One point per listing, coloured by neighbourhood group.
fig = px.scatter_mapbox(
    data,
    lat="latitude",
    lon="longitude",
    color="neighbourhood_group",
    zoom=10,
    height=800,
)
fig.update_layout(
    mapbox_style="carto-positron",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()
Czy sąsiedztwa to takie "poddzielnice"?
# One point per listing, coloured by neighbourhood, with the legend shown.
fig = px.scatter_mapbox(
    data,
    lat="latitude",
    lon="longitude",
    color="neighbourhood",
    zoom=10,
    height=800,
)
fig.update_layout(
    mapbox_style="carto-positron",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    showlegend=True,
)
fig.show()
Jak wygląda rozkład cen w NY? (patrzymy bez górnych 5%)
# The catch: the denser the cluster of listings in a spot, the brighter it
# is drawn, which is why I consider this plot meaningless.
# Playing with the `radius` parameter is enough to see it.
fig = px.density_mapbox(
    price_df,
    lat="latitude",
    lon="longitude",
    z="price",
    radius=2,
    zoom=10,
    height=800,
)
fig.update_layout(
    mapbox_style="carto-positron",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()
# Non-null value counts of every column within each neighbourhood.
price_df.groupby("neighbourhood").count().reset_index()
| neighbourhood | neighbourhood_group | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | rating | bedrooms | beds | baths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Allerton | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 28 | 26 | 28 | 28 | 28 |
| 1 | Arden Heights | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 2 | Arrochar | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 8 | 11 | 11 | 11 |
| 3 | Arverne | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 48 | 43 | 48 | 48 | 48 |
| 4 | Astoria | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 274 | 244 | 274 | 274 | 274 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 214 | Windsor Terrace | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 35 | 41 | 41 | 41 |
| 215 | Woodhaven | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 55 | 45 | 55 | 55 | 55 |
| 216 | Woodlawn | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 5 | 6 | 6 | 6 |
| 217 | Woodrow | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 1 | 3 | 3 | 3 |
| 218 | Woodside | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 141 | 105 | 141 | 141 | 141 |
219 rows × 18 columns
# Group the listings by neighbourhood and average every numeric feature;
# the averaged coordinates should point at the centre of each neighbourhood.
# numeric_only=True is passed explicitly because the frame still holds text
# columns: relying on the default emits a FutureWarning in pandas 1.5 and
# raises once that default becomes False.
df = price_df.groupby("neighbourhood").mean(numeric_only=True).reset_index()
df["counts"] = price_df.groupby("neighbourhood").count().reset_index()["latitude"]
# Marker size shows the mean price, marker colour the mean rating.
fig = px.scatter_mapbox(df,
                        lat="latitude", lon="longitude",
                        size="price", color="rating",
                        hover_name="neighbourhood", hover_data=["price", "rating", "counts"],
                        zoom=10, height=800, color_continuous_scale="bluered")
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(showlegend=False)
fig.show()
C:\Users\barto\AppData\Local\Temp\ipykernel_4412\3652456222.py:3: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Which neighbourhoods are trending, i.e. have the most reviews over the
# previous year? Marker size shows the median number_of_reviews_ltm, marker
# colour the median rating.
# numeric_only=True is passed explicitly because the frame still holds text
# columns: relying on the default emits a FutureWarning in pandas 1.5 and
# raises once that default becomes False.
df = price_df.groupby("neighbourhood").median(numeric_only=True).reset_index()
df["counts"] = price_df.groupby("neighbourhood").count().reset_index()["latitude"]
fig = px.scatter_mapbox(df,
                        lat="latitude", lon="longitude",
                        size="number_of_reviews_ltm", color="rating",
                        hover_name="neighbourhood", hover_data=["number_of_reviews_ltm", "rating", "counts"],
                        zoom=10, height=800, color_continuous_scale="bluered")
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(showlegend=False)
fig.show()
C:\Users\barto\AppData\Local\Temp\ipykernel_4412\4160348705.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.